Import helpers and packages


In [33]:
import pandas as pd
from autoc import DataExploration, PreProcessor, NaImputer
from autoc.utils.getdata import get_dataset
import numpy as np

# scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score, train_test_split  # moved to sklearn.model_selection in newer scikit-learn versions
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_curve, accuracy_score, auc, classification_report

# matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

Playing with titanic data

Two approaches:

  • Simulate missing data and check whether we can impute them correctly with a machine learning algorithm
  • Impute the real missing values and check whether this improves prediction performance

In [34]:
titanic = get_dataset("titanic")

In [35]:
titanic.who.dtype.kind == 'O'


Out[35]:
True

In [36]:
titanic.head()


Out[36]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 0 3 male 22 1 0 7.2500 S Third man True NaN Southampton no False
1 1 1 female 38 1 0 71.2833 C First woman False C Cherbourg yes False
2 1 3 female 26 0 0 7.9250 S Third woman False NaN Southampton yes True
3 1 1 female 35 1 0 53.1000 S First woman False C Southampton yes False
4 0 3 male 35 0 0 8.0500 S Third man True NaN Southampton no True

In [37]:
exploration_titanic = DataExploration(titanic)

In [38]:
exploration_titanic.print_infos() # there are duplicate rows here because there is no id column, interesting!


{'duplicated_rows': {'action': 'delete',
                     'comment': 'You should delete this rows with df.drop_duplicates()',
                     'level': 'ERROR',
                     'value': Int64Index([ 47,  76,  77,  87,  95, 101, 121, 133, 173, 196,
            ...
            838, 844, 846, 859, 863, 870, 877, 878, 884, 886],
           dtype='int64', length=107)}}
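
The report literally suggests df.drop_duplicates(); a minimal sketch of following that advice (not applied here, the rest of the notebook keeps the full dataset):

In [ ]:
titanic_dedup = titanic.drop_duplicates()  # drops the 107 duplicated rows flagged above
print("{} -> {}".format(titanic.shape, titanic_dedup.shape))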

In [39]:
exploration_titanic.nacolcount()


Out[39]:
Nanumber Napercentage
survived 0 0.000000
pclass 0 0.000000
sex 0 0.000000
age 177 0.198653
sibsp 0 0.000000
parch 0 0.000000
fare 0 0.000000
embarked 2 0.002245
class 0 0.000000
who 0 0.000000
adult_male 0 0.000000
deck 688 0.772166
embark_town 2 0.002245
alive 0 0.000000
alone 0 0.000000
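
deck stands out with ~77% of its values missing; a quick sketch (not in the original notebook) of flagging mostly-missing columns from this table:

In [ ]:
na_counts = exploration_titanic.nacolcount()
na_counts[na_counts.Napercentage > 0.5].index.tolist()  # here only ['deck']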

In [40]:
exploration_titanic.structure()


Out[40]:
dtypes_p dtypes_r nb_missing perc_missing nb_unique_values constant_columns na_columns is_key dtype_infer string_length
survived int64 numeric 0 0.000000 2 False False False integer NaN
pclass int64 numeric 0 0.000000 3 False False False integer NaN
sex object factor 0 0.000000 2 False False False string 6
age float64 numeric 177 0.198653 88 False False False floating NaN
sibsp int64 numeric 0 0.000000 7 False False False integer NaN
parch int64 numeric 0 0.000000 7 False False False integer NaN
fare float64 numeric 0 0.000000 248 False False False floating NaN
embarked object factor 2 0.002245 3 False False False mixed 1
class object factor 0 0.000000 3 False False False string 6
who object factor 0 0.000000 3 False False False string 5
adult_male bool factor 0 0.000000 2 False False False boolean NaN
deck object factor 688 0.772166 7 False False False mixed 1
embark_town object factor 2 0.002245 3 False False False mixed 11
alive object factor 0 0.000000 2 False False False string 3
alone bool factor 0 0.000000 2 False False False boolean NaN

In [41]:
titanic.corr()


Out[41]:
survived pclass age sibsp parch fare adult_male alone
survived 1.000000 -0.338481 -0.077221 -0.035322 0.081629 0.257307 -0.557080 -0.203367
pclass -0.338481 1.000000 -0.369226 0.083081 0.018443 -0.549500 0.094035 0.135207
age -0.077221 -0.369226 1.000000 -0.308247 -0.189119 0.096067 0.280328 0.198270
sibsp -0.035322 0.083081 -0.308247 1.000000 0.414838 0.159651 -0.253586 -0.584471
parch 0.081629 0.018443 -0.189119 0.414838 1.000000 0.216225 -0.349943 -0.583398
fare 0.257307 -0.549500 0.096067 0.159651 0.216225 1.000000 -0.182024 -0.271832
adult_male -0.557080 0.094035 0.280328 -0.253586 -0.349943 -0.182024 1.000000 0.404744
alone -0.203367 0.135207 0.198270 -0.584471 -0.583398 -0.271832 0.404744 1.000000
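
A quick way to eyeball this matrix graphically (a hedged sketch, not part of the original notebook):

In [ ]:
corr = titanic.corr()
plt.matshow(corr)  # heatmap of the correlation matrix
plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
plt.yticks(range(len(corr.columns)), corr.columns)
plt.colorbar()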

In [42]:
titanic.loc[titanic.age.isnull(),:].head(5)


Out[42]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
5 0 3 male NaN 0 0 8.4583 Q Third man True NaN Queenstown no True
17 1 2 male NaN 0 0 13.0000 S Second man True NaN Southampton yes True
19 1 3 female NaN 0 0 7.2250 C Third woman False NaN Cherbourg yes True
26 0 3 male NaN 0 0 7.2250 C Third man True NaN Cherbourg no True
28 1 3 female NaN 0 0 7.8792 Q Third woman False NaN Queenstown yes True

Preprocessing data


In [43]:
preprocessor = PreProcessor(titanic)

In [44]:
preprocessor.infer_subtypes()


Out[44]:
{'adult_male': {'dtype': dtype('bool'), 'subtype': 'binary'},
 'age': {'dtype': dtype('float64'), 'subtype': None},
 'alive': {'dtype': dtype('O'), 'subtype': 'binary'},
 'alone': {'dtype': dtype('bool'), 'subtype': 'binary'},
 'class': {'dtype': dtype('O'), 'subtype': 'text_categorical'},
 'deck': {'dtype': dtype('O'), 'subtype': 'text_categorical'},
 'embark_town': {'dtype': dtype('O'), 'subtype': 'text_categorical'},
 'embarked': {'dtype': dtype('O'), 'subtype': 'text_categorical'},
 'fare': {'dtype': dtype('float64'), 'subtype': None},
 'parch': {'dtype': dtype('int64'), 'subtype': 'ordinal'},
 'pclass': {'dtype': dtype('int64'), 'subtype': 'ordinal'},
 'sex': {'dtype': dtype('O'), 'subtype': 'binary'},
 'sibsp': {'dtype': dtype('int64'), 'subtype': 'ordinal'},
 'survived': {'dtype': dtype('int64'), 'subtype': 'binary'},
 'who': {'dtype': dtype('O'), 'subtype': 'text_categorical'}}
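
This mapping can be reused to pick the columns to one-hot encode later on; a minimal sketch (column order may differ from run to run):

In [ ]:
subtypes = preprocessor.infer_subtypes()
[col for col, info in subtypes.items() if info['subtype'] == 'text_categorical']
# according to the output above: class, deck, embark_town, embarked, who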

In [45]:
titanic = titanic.drop('alive', axis = 1)

Transform everything to numeric variables for the scikit-learn model

The dataset is not clean enough to be converted directly into a numpy array and used for machine learning with scikit-learn:

  • The target variable (alive) is not an integer dummy variable (0 or 1); a small sketch of that conversion follows this list
  • The values are heterogeneous (string labels for categories, integers and floating point numbers), whereas numpy arrays for scikit-learn only support integers and floating point numbers
  • Some observations are missing (np.nan)
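
For illustration only (the notebook uses survived as the target instead), a minimal sketch of converting a yes/no label such as alive into the 0/1 dummy that scikit-learn expects; a fresh copy is reloaded because alive was dropped in the cell above:

In [ ]:
titanic_raw = get_dataset("titanic")                       # fresh copy, still contains 'alive'
alive_dummy = (titanic_raw['alive'] == 'yes').astype(int)  # 'yes'/'no' -> 1/0
alive_dummy.head()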

In [46]:
features_full = pd.concat([titanic.loc[:, ['fare', 'age', 'pclass', 'sibsp', 'parch']],
                      pd.get_dummies(titanic['sex'], prefix='sex'),
                      pd.get_dummies(titanic['who'], prefix='who'),
                      pd.get_dummies(titanic['alone'], prefix='alone'),
                      pd.get_dummies(titanic['embarked'], prefix='embarked')],
                     axis=1)

In [47]:
features = pd.concat([titanic[['fare', 'age', 'pclass']],
                      pd.get_dummies(titanic['sex'], prefix='sex'),
                      pd.get_dummies(titanic['who'], prefix='who'),
                      pd.get_dummies(titanic['embarked'], prefix='embarked')],
                     axis=1)

In [48]:
target = titanic.survived

Simple Model to understand variable importances

Prepare data

In [49]:
# Impute missing values 
imp = NaImputer(features_full)
features_full = imp.basic_naimputation(['age']) # still a pandas DataFrame, but with the age NaNs imputed
target = titanic.survived

In [50]:
# Creating test train 
features_train, features_test, target_train, target_test = train_test_split(
    features_full.values, target.values, test_size=0.25, random_state=0)

Logistic Regression

In [51]:
logreg = LogisticRegression(C=1)
logreg.fit(features_train, target_train)
target_pred = logreg.predict(features_test)


feature_names = features_full.columns
print("Accuracy : {}".format(accuracy_score(target_test, target_pred)))
weights = logreg.coef_.flatten()
dict_weights = {k:v for k,v in zip(feature_names, weights)}


Accuracy : 0.811659192825

In [52]:
def plot_simple_imp(imp, feature_names, sort=True, absolute=False):
    """ Horizontal bar plot of weights / importances indexed by feature names """
    serie = pd.Series(index=feature_names, data=imp)
    if absolute:
        serie = np.abs(serie)
    if sort:
        serie.sort_values(inplace=True, ascending=False)
    serie.plot(kind='barh')

In [53]:
plot_simple_imp(weights, feature_names)



In [54]:
# Looking at weights 
feature_names = features_full.columns

def plot_abs_weights(coeff_arr, feature_names, title=None, legend_size=12, figsize=(15, 7)):
    """ Sorted horizontal bar plot of absolute coefficients with their feature names """
    coeff_arr = np.abs(coeff_arr)  # take absolute value
    order = np.argsort(coeff_arr)  # sort coefficients and keep the labels aligned with them
    plt.figure(figsize=figsize)
    plt.barh(range(len(feature_names)), coeff_arr[order])
    plt.yticks(range(len(feature_names)), np.asarray(feature_names)[order], size=legend_size)
    if title:
        plt.title(title)

In [55]:
plot_abs_weights(logreg.coef_.ravel(), feature_names, title="Absolute Coefficient Logistic Regression")


Random Forest

In [56]:
rf_full = RandomForestClassifier(n_estimators=500)
rf_full.fit(features_train, target_train)


Out[56]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [57]:
rf_full.score(features_test, target_test)


Out[57]:
0.81165919282511212

In [58]:
plot_simple_imp(rf_full.feature_importances_, feature_names)


Studying simple imputation techniques and their effect on accuracy

Here we try to predict who survives, keeping the natural missing values in the age variable, and compare four ways of handling them (a compact preview of the whole comparison follows the helper functions in the next cell):

  • Deletion of rows containing missing values
  • Deletion of columns containing missing values
  • Imputation of missing values with the median
  • Imputation of missing values with -1

In [130]:
def rf_cv(features, target, random_state=1, n_estimators=200, scoring='accuracy', n_jobs=4, verbose=True):
    """ Print scores of a random forest cross validation """
    rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    scores = cross_val_score(rf, features, target, cv=4, scoring=scoring, n_jobs=n_jobs)
    if verbose:
        print("Random Forest CV scores:min: {:.3f}, mean: {:.3f}, max: {:.3f}".format(
                scores.min(), scores.mean(), scores.max()))
    return scores

def logreg_cv(features, target, scoring='accuracy', n_jobs=4, verbose=True):
    """ Print scores of a logistic regression cross validation """
    logreg = LogisticRegression(C=1)
    scores = cross_val_score(logreg, features, target, cv=4, scoring=scoring, n_jobs=n_jobs)
    if verbose:
        print("Logistic Regression CV scores: min: {:.3f}, mean: {:.3f}, max: {:.3f}".format(
                scores.min(), scores.mean(), scores.max()))
    return scores
    

def plot_roc_curve(target_test, target_predicted_proba):
    fpr, tpr, thresholds = roc_curve(target_test, target_predicted_proba[:, 1])
    
    roc_auc = auc(fpr, tpr)
    # Plot ROC curve
    plt.plot(fpr, tpr, label='ROC curve (AUC = %0.3f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')  # random predictions curve
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate or (1 - Specificity)')
    plt.ylabel('True Positive Rate or (Sensitivity)')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
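
Before walking through each strategy cell by cell, here is a compact hedged preview of the same comparison in a single loop (assuming features, target and rf_cv as defined above; only age contains NaN in features):

In [ ]:
strategies = {
    'row deletion': (features.dropna(subset=['age']), target[features.age.notnull()]),
    'column deletion': (features.drop('age', axis=1), target),
    'median imputation': (features.fillna({'age': features.age.median()}), target),
    'constant -1 imputation': (features.fillna(-1), target),
}
for name, (X, y) in strategies.items():
    print(name)
    rf_cv(X.values, y.values, scoring='accuracy')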

Row Deletion strategy


In [131]:
# selecting index and transforming into numpy array 
index_missing_age = features.age.isnull()
features_rm_a, target_rm_a = features.loc[~index_missing_age, :].values, target[~index_missing_age].values

In [132]:
features_rm_a.shape


Out[132]:
(714, 11)

In [133]:
rf_cv(features_rm_a, target_rm_a, scoring='accuracy')


Random Forest CV scores:min: 0.726, mean: 0.784, max: 0.816
Out[133]:
array([ 0.72625698,  0.81564246,  0.80898876,  0.78651685])

In [134]:
features_train_rm, features_test_rm, target_train_rm, target_test_rm = train_test_split(
    features_rm_a, target_rm_a, test_size=0.25, random_state=0)

In [135]:
rf = RandomForestClassifier(n_estimators=200)
rf.fit(features_train_rm, target_train_rm)
target_predicted_proba = rf.predict_proba(features_test_rm)
plot_roc_curve(target_test_rm, target_predicted_proba)



In [136]:
rf_cv(features_rm_a, target_rm_a,random_state=0, scoring='roc_auc')


Random Forest CV scores:min: 0.812, mean: 0.849, max: 0.865
Out[136]:
array([ 0.81209615,  0.85655208,  0.86130765,  0.86458333])

Col Deletion strategy


In [137]:
# selecting index and transforming into numpy array 
features_cm_a, target_cm_a = features.drop('age', axis =1).values, target.values

In [138]:
features_cm_a.shape


Out[138]:
(891, 10)

In [139]:
rf_cv(features_cm_a, target_cm_a, scoring="accuracy")


Random Forest CV scores:min: 0.759, mean: 0.809, max: 0.833
Out[139]:
array([ 0.75892857,  0.8206278 ,  0.83333333,  0.82432432])

In [140]:
features_train_cm, features_test_cm, target_train_cm, target_test_cm = train_test_split(
    features_cm_a, target_cm_a, test_size=0.25, random_state=0)

In [141]:
rf = RandomForestClassifier(n_estimators=200)
rf.fit(features_train_cm, target_train_cm)
target_predicted_proba = rf.predict_proba(features_test_cm)
plot_roc_curve(target_test_cm, target_predicted_proba)



In [142]:
rf_cv(features_cm_a, target_cm_a, scoring="roc_auc")


Random Forest CV scores:min: 0.805, mean: 0.850, max: 0.880
Out[142]:
array([ 0.80535895,  0.88032592,  0.85169601,  0.86225848])

Median Imputation strategy


In [143]:
# selecting index and transforming into numpy array 

features_imp = features.copy()
features.shape


Out[143]:
(891, 11)

In [144]:
# features_imp.loc[:,'is_na_age'] =features_imp.age.isnull().astype(int)
# imp = NaImputer(features) # creating our imputer instance
# features_imp = imp.basic_naimputation(columns_to_process=['age'])

In [145]:
features_imp = features.fillna(-1)  # constant -1 imputation is used here; the NaImputer-based imputation above is left commented out

In [146]:
features_imp_a, target_imp_a = features_imp.values, target.values

In [147]:
rf_cv(features_imp_a, target_imp_a, scoring='accuracy')


Random Forest CV scores:min: 0.763, mean: 0.804, max: 0.821
Out[147]:
array([ 0.76339286,  0.8206278 ,  0.81531532,  0.81531532])

In [148]:
features_train_imp, features_test_imp, target_train_imp, target_test_imp = train_test_split(
    features_imp_a, target_imp_a, test_size=0.25, random_state=0)

In [149]:
rf = RandomForestClassifier(n_estimators=200)
rf.fit(features_train_imp, target_train_imp)
target_predicted_proba = rf.predict_proba(features_test_imp)
plot_roc_curve(target_test_imp, target_predicted_proba)



In [150]:
rf_cv(features_imp_a, target_imp_a, scoring='roc_auc')


Random Forest CV scores:min: 0.819, mean: 0.852, max: 0.873
Out[150]:
array([ 0.81905123,  0.87260227,  0.85234006,  0.86225848])

In [151]:
rf.feature_importances_


Out[151]:
array([ 0.28061501,  0.2436398 ,  0.0967306 ,  0.06874228,  0.09025391,
        0.01820227,  0.11959495,  0.04632542,  0.01161832,  0.0097387 ,
        0.01453875])
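
The raw importance array is hard to read; a quick sketch that attaches the column names of features_imp:

In [ ]:
pd.Series(rf.feature_importances_, index=features_imp.columns).sort_values(ascending=False)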

Simulating missing values in a column and measuring imputation performance

The purpose of this section is to simulate missing values in an important variable (pclass in the cells below), observe the resulting decrease in performance, and assess the ability to correctly impute the missing values.

  • We start from the features_full dataset

In [152]:
# constructing features 
features_imp = pd.concat([titanic[['pclass']],
                      pd.get_dummies(titanic['sex'], prefix='sex'),
                      pd.get_dummies(titanic['who'], prefix='who')],axis=1)

In [153]:
features_imp.pclass.value_counts()


Out[153]:
3    491
1    216
2    184
Name: pclass, dtype: int64
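
As a point of reference for the imputation-by-prediction experiment at the end of the notebook: the majority class (pclass = 3) alone covers 491 / 891 ≈ 55% of the rows. A hedged sketch of that naive baseline:

In [ ]:
majority_share = features_imp.pclass.value_counts(normalize=True).max()
print("Always predicting pclass = 3 would be right {:.1%} of the time".format(majority_share))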

Comparing simulated model and raw model


In [156]:
#scores_imp = logreg_cv(features_imp.drop('pclass',1), target)

In [174]:
def insert_na(features_full=features_imp, target=target, index=False,
              col_to_simulate='pclass', pct_na_toinsert=0.2, verbose=False):
    """ Return the dataset with a given percentage of NaN injected into one column
    (or only the injected index positions if index=True) """
    nb_na_toinsert = int(pct_na_toinsert * len(features_full))
    index_na_toinsert = np.random.choice(range(len(features_full)), nb_na_toinsert, replace=False)
    if verbose:
        print("We are inserting {} missing values".format(len(index_na_toinsert)))
    features_full_imp = features_full.copy()
    if index:
        return index_na_toinsert
    else:
        features_full_imp.loc[index_na_toinsert, col_to_simulate] = np.nan
        return features_full_imp

def score_rf_sim(features_full=features_imp, target=target,
                 col_to_simulate='pclass', pct_na_toinsert=0.2, n_repeat=10, verbose=False, *args, **kwargs):
    """ Insert a percentage of missing values in one variable, impute it, and measure the influence
    on performance (note: scored with logreg_cv despite the _rf_ in the name) """
    features_full_imp = insert_na(features_full, target=target,
                                  col_to_simulate=col_to_simulate, pct_na_toinsert=pct_na_toinsert, verbose=verbose)
    imp_f = NaImputer(features_full_imp)
    features_full_imp.loc[:, col_to_simulate] = imp_f.fillna_serie(colname=col_to_simulate)
    # repeated cross validation
#     score_rcv = 0
#     for i in range(n_repeat):
#         score_rcv += logreg_cv(features_full_imp, target, *args, **kwargs).mean()
    return logreg_cv(features_full_imp, target, verbose=False).mean()

In [175]:
score_rf_sim(col_to_simulate='pclass', verbose=True)


We are inserting 178 missing values
Out[175]:
0.79793164500984004

In [195]:
accuracy_mean_pct_na = np.array([score_rf_sim(
            pct_na_toinsert=i,col_to_simulate='pclass',verbose=True) for i in np.linspace(0,0.98,10)])


We are inserting 0 missing values
We are inserting 97 missing values
We are inserting 194 missing values
We are inserting 291 missing values
We are inserting 388 missing values
We are inserting 485 missing values
We are inserting 582 missing values
We are inserting 679 missing values
We are inserting 776 missing values
We are inserting 873 missing values

In [179]:
def sim_nmc(nmc=60, n_interval=5, *args, **kwargs):
    """ Average score_rf_sim over nmc Monte Carlo repetitions for each missing-value percentage """
    res = np.zeros(n_interval)
    for i in range(nmc):
        res += np.array([score_rf_sim(
            pct_na_toinsert=pct, *args, **kwargs) for pct in np.linspace(0, 0.98, n_interval)])
    return res / nmc

In [180]:
test = sim_nmc(nmc=30, n_interval=5)

In [181]:
test


Out[181]:
array([ 0.80017885,  0.79718608,  0.78924172,  0.78587207,  0.78358648])

In [182]:
np.linspace(0,0.98,5)


Out[182]:
array([ 0.   ,  0.245,  0.49 ,  0.735,  0.98 ])

In [183]:
plt.plot(np.linspace(0,0.98,5), test)
plt.title('Accuracy function of percentage of missing values inserted')


Out[183]:
<matplotlib.text.Text at 0x1170ab590>

Trying to predict missing values

We start with all features to have better prediction power


In [184]:
features_pred = features_full.copy().drop_duplicates()
features_pred = features_pred.drop('age', axis = 1)

In [185]:
index_na = insert_na(col_to_simulate='pclass', pct_na_toinsert=0.2, index=True)

In [186]:
index_na = features_pred.index.isin(index_na)

In [187]:
features_pred.head()


Out[187]:
fare pclass sibsp parch sex_female sex_male who_child who_man who_woman alone_False alone_True embarked_C embarked_Q embarked_S
0 7.2500 3 1 0 0 1 0 1 0 1 0 0 0 1
1 71.2833 1 1 0 1 0 0 0 1 1 0 1 0 0
2 7.9250 3 0 0 1 0 0 0 1 0 1 0 0 1
3 53.1000 1 1 0 1 0 0 0 1 1 0 0 0 1
4 8.0500 3 0 0 0 1 0 1 0 0 1 0 0 1

In [188]:
target = features_pred.pclass
features_pred = features_pred.drop('pclass', axis = 1)

In [189]:
features_pred_train, target_pred_train = features_pred.loc[~index_na, :], target[~index_na]
features_pred_test, target_pred_test = features_pred.loc[index_na, :], target[index_na]

In [190]:
features_pred_train.shape


Out[190]:
(614, 13)

In [191]:
features_pred_test.head()


Out[191]:
fare sibsp parch sex_female sex_male who_child who_man who_woman alone_False alone_True embarked_C embarked_Q embarked_S
7 21.075 3 1 0 1 1 0 0 1 0 0 0 1
17 13.000 0 0 0 1 0 1 0 0 1 0 0 1
20 26.000 0 0 0 1 0 1 0 0 1 0 0 1
24 21.075 3 1 1 0 1 0 0 1 0 0 0 1
26 7.225 0 0 0 1 0 1 0 0 1 1 0 0

In [192]:
rf = RandomForestClassifier(n_estimators=200)
rf.fit(features_pred_train, target_pred_train)
target_predicted = rf.predict(features_pred_test)
target_predicted_proba = rf.predict_proba(features_pred_test)

In [194]:
print(classification_report(target_pred_test, target_predicted))


             precision    recall  f1-score   support

          1       0.97      0.92      0.94        37
          2       0.85      0.82      0.84        28
          3       0.93      0.96      0.94        79

avg / total       0.92      0.92      0.92       144
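
To close the loop, a small sketch (reusing the objects defined above) that writes the predicted classes back into the masked positions and also reports plain accuracy:

In [ ]:
pclass_completed = target.copy()                 # true pclass values for every row of features_pred
pclass_completed[index_na] = target_predicted    # overwrite the simulated-missing entries with predictions
print("Accuracy on the masked rows: {:.3f}".format(accuracy_score(target_pred_test, target_predicted)))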